library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(flextable)
library(officer)
library(tibble)
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(pheatmap)
library(caret)
## Loading required package: lattice
library(corrplot)
## corrplot 0.94 loaded
library(randomForestSRC)
## 
##  randomForestSRC 3.3.1 
##  
##  Type rfsrc.news() to see new features, changes, and bug fixes. 
## 
library(survival)
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster

1. Read and Clean Data

# Read data and rename columns
df <- readxl::read_excel("./data/AI_Tcells_DB.xlsx") 

# Check duplicates
duplicates <- df %>%
  group_by(ID, Time_OS, cGVHD_time, Names) %>%
  filter(n() > 1) %>%
  ungroup()  

# Clean the data 
df_clean <- df %>%
  # Rename columns
  rename(
    Subject_ID = ID,
    Observation_Days_Count = Time_OS,
    cGVHD_Diagnosis_Day = cGVHD_time,
    Cell_Count = Abs_Value
  ) %>%
  mutate(
    # Add flag to indicate which patients experienced cGVHD
    cGVHD_flag = as.numeric(ifelse(!is.na(cGVHD_Diagnosis_Day), 1, 0)),
    Subject_ID = as.factor(Subject_ID),
    
    # Add blood test group 
    Blood_test_day = case_when(
      grepl('ДЕНЬ ХРРТПХ_ПЕР.КРОВЬ', df$Names) ~ "cGVHD",
      grepl('ДЕНЬ ХРРТПХ +', df$Names) ~ paste0(str_extract(df$Names, "(?<=\\+)\\d+(?=_)"), "_cGVHD"),
      TRUE ~ str_extract(df$Names, "(?<=\\+)\\d+(?=_)")
    ),
    
    # Create exact cell name
    Cell_name = str_extract(Names, "(?<=/).*")
  ) %>%
  select(-Names)

unify_cell_name <- function(cell_name, replacements) {
  cell_name_unified <- cell_name
  for (replacement in replacements) {
    old_value <- replacement[1]
    new_value <- replacement[2]
    
    old_value <- str_replace_all(old_value, "([\\+\\*\\?\\|\\(\\)\\[\\]\\{\\}\\^\\$\\.])", "\\\\\\1")
    cell_name_unified <- str_replace_all(cell_name_unified, old_value, new_value)
  }
  return(cell_name_unified)
}

replacements <- list(
  c("PD1", "PD-1"),
  c("СТАР2", "STAR2"),
  c("4", "CD4_"),
  c("CD4_+", "CD4+_"),
  c("8", "CD8_"),
  c("CD8_+", "CD8+_"),
  c("Th", "TH"),
  c("__", "_"),
  c("_ ", "_")
)

# Apply the function to dataframe
df_clean <- df_clean %>%
  mutate(
    Cell_name_unified = Cell_name,
    Cell_name_unified = unify_cell_name(Cell_name, replacements)
    )

# Check for duplicates for control 
duplicates <- df_clean %>%
  group_by(Subject_ID, Observation_Days_Count, cGVHD_Diagnosis_Day, cGVHD_flag, Blood_test_day, Cell_name) %>%
  filter(n() > 1) %>%
  ungroup() 

# Check the unique cells name 
unique_cells_name <- unique(df_clean$Cell_name_unified)
print(unique_cells_name)
##   [1] "CD4_TFH"              "CD4_TH1"              "CD4_TH17"            
##   [4] "CD4_TH17TO1"          "CD4_TH2"              "CD4_TH22"            
##   [7] "CD4+_(IM STAT)"       "CD4+_"                "CD4+_226+"           
##  [10] "CD4+_39+"             "CD4+_DR+"             "CD4+_PD-1+"          
##  [13] "CD4+_PD-1+TIGIT-"     "CD4+_PD-1+TIGIT+"     "CD4+_PD-1-TIGIT-"    
##  [16] "CD4+_PD-1-TIGIT+"     "CD4+_TIGIT+"          "CD4_NV(STAR2)"       
##  [19] "CD4_NV"               "CD4_NV_TH1"           "CD4_NV_TH17"         
##  [22] "CD4_NV_TH17TO1"       "CD4_NV_TH2"           "CD4_NV_TH22"         
##  [25] "CD4_NV+226+"          "CD4_NV+39+"           "CD4_NV+DR+"          
##  [28] "CD4_NV+PD-1+"         "CD4_NV+PD-1+TIGIT-"   "CD4_NV+PD-1+TIGIT+"  
##  [31] "CD4_NV+PD-1-TIGIT-"   "CD4_NV+PD-1-TIGIT+"   "CD4_NV+TIGIT+"       
##  [34] "CD4_ЕМ"               "CD4_ЕМ_TH1"           "CD4_ЕМ_TH17"         
##  [37] "CD4_ЕМ_TH17TO1"       "CD4_ЕМ_TH2"           "CD4_ЕМ_TH22"         
##  [40] "CD4_ЕМ+226+"          "CD4_ЕМ+39+"           "CD4_ЕМ+DR+"          
##  [43] "CD4_ЕМ+PD-1+"         "CD4_ЕМ+PD-1+TIGIT-"   "CD4_ЕМ+PD-1+TIGIT+"  
##  [46] "CD4_ЕМ+PD-1-TIGIT-"   "CD4_ЕМ+PD-1-TIGIT+"   "CD4_ЕМ+TIGIT+"       
##  [49] "CD4_ЕМTM"             "CD4_СМ(STAR2)"        "CD4_СМ"              
##  [52] "CD4_СМ_TH1"           "CD4_СМ_TH17"          "CD4_СМ_TH17TO1"      
##  [55] "CD4_СМ_TH2"           "CD4_СМ_TH22"          "CD4_СМ+226+"         
##  [58] "CD4_СМ+39+"           "CD4_СМ+DR+"           "CD4_СМ+PD-1+"        
##  [61] "CD4_СМ+PD-1+TIGIT-"   "CD4_СМ+PD-1+TIGIT+"   "CD4_СМ+PD-1-TIGIT-"  
##  [64] "CD4_СМ+PD-1-TIGIT+"   "CD4_СМ+TIGIT+"        "CD4_ТM_TH1"          
##  [67] "CD4_ТM_TH17"          "CD4_ТM_TH17TO1"       "CD4_ТM_TH2"          
##  [70] "CD4_ТM_TH22"          "CD4_ТREG(STAR2)"      "CD4_ТREG"            
##  [73] "CD4_ТREG_TH1"         "CD4_ТREG_TH17"        "CD4_ТREG_TH17TO1"    
##  [76] "CD4_ТREG_TH2"         "CD4_ТREG_TH22"        "CD4_ТЕ(STAR2)"       
##  [79] "CD4_ТЕ"               "CD4_ТЕ_TH1"           "CD4_ТЕ_TH17"         
##  [82] "CD4_ТЕ_TH17TO1"       "CD4_ТЕ_TH2"           "CD4_ТЕ_TH22"         
##  [85] "CD4_ТЕ+226+"          "CD4_ТЕ+39+"           "CD4_ТЕ+DR+"          
##  [88] "CD4_ТЕ+PD-1+"         "CD4_ТЕ+PD-1+TIGIT-"   "CD4_ТЕ+PD-1+TIGIT+"  
##  [91] "CD4_ТЕ+PD-1-TIGIT-"   "CD4_ТЕ+PD-1-TIGIT+"   "CD4_ТЕ+TIGIT+"       
##  [94] "CD4_ТМ"               "CD8+_(IM STAT)"       "CD8+_"               
##  [97] "CD8+_226+"            "CD8+_39+"             "CD8+_DR+"            
## [100] "CD8+_PD-1+"           "CD8+_PD-1+TIGIT-"     "CD8+_PD-1+TIGIT+"    
## [103] "CD8+_PD-1-TIGIT-"     "CD8+_PD-1-TIGIT+"     "CD8+_TIGIT+"         
## [106] "CD8_EMTM"             "CD8_EMTM+226+"        "CD8_EMTM+39+"        
## [109] "CD8_EMTM+DR+"         "CD8_EMTM+PD-1+"       "CD8_EMTM+PD-1+TIGIT-"
## [112] "CD8_EMTM+PD-1+TIGIT+" "CD8_EMTM+PD-1-TIGIT-" "CD8_EMTM+PD-1-TIGIT+"
## [115] "CD8_EMTM+TIGIT+"      "CD8_NV(STAR2)"        "CD8_NV"              
## [118] "CD8_NV+226+"          "CD8_NV+39+"           "CD8_NV+DR+"          
## [121] "CD8_NV+PD-1+"         "CD8_NV+PD-1+TIGIT-"   "CD8_NV+PD-1+TIGIT+"  
## [124] "CD8_NV+PD-1-TIGIT-"   "CD8_NV+PD-1-TIGIT+"   "CD8_NV+TIGIT+"       
## [127] "CD8_TREG"             "CD8_ЕМ"               "CD8_СМ(STAR2)"       
## [130] "CD8_СМ"               "CD8_СМ+226+"          "CD8_СМ+39+"          
## [133] "CD8_СМ+DR+"           "CD8_СМ+PD-1+"         "CD8_СМ+PD-1+TIGIT-"  
## [136] "CD8_СМ+PD-1+TIGIT+"   "CD8_СМ+PD-1-TIGIT-"   "CD8_СМ+PD-1-TIGIT+"  
## [139] "CD8_СМ+TIGIT+"        "CD8_ТЕ(STAR2)"        "CD8_ТЕ"              
## [142] "CD8_ТЕ+226+"          "CD8_ТЕ+39+"           "CD8_ТЕ+DR+"          
## [145] "CD8_ТЕ+PD-1+"         "CD8_ТЕ+PD-1+TIGIT-"   "CD8_ТЕ+PD-1+TIGIT+"  
## [148] "CD8_ТЕ+PD-1-TIGIT-"   "CD8_ТЕ+PD-1-TIGIT+"   "CD8_ТЕ+TIGIT+"       
## [151] "CD8_ТМ"               "TREG_CM"              "TREG_EMTM"           
## [154] "TREG_NV"              "TREG_TE"              "TREG+226+"           
## [157] "TREG+226+TIGIT-"      "TREG+226+TIGIT+"      "TREG+226-TIGIT-"     
## [160] "TREG+226-TIGIT+"      "TREG+39+"             "TREG+DR+"            
## [163] "TREG+PD-1+"           "TREG+TIGIT+"
rm(duplicates, replacements, unify_cell_name)

Crucial steps:

  1. cGVHD Flag: Adds a flag (cGVHD_flag) to indicate if a patient experienced chronic GVHD (1 if cGVHD_Diagnosis_Day is not NA, otherwise 0).
  2. Blood Test Timing: classifies the blood test timing into groups (e.g., cGVHD, +30_cGVHD).
  3. Cell Name Parsing: Extracts the exact immune cell name from the Names column.
  4. Defines a function unify_cell_name to standardize cell names using a list of replacements (e.g., PD1 -> PD-1, 4 -> CD4_).

1.1. Rearrange Data

# Check the number of unique days
unique_days <- unique(df_clean$Blood_test_day)
print(unique_days)
##  [1] "180"       "30"        "365"       "60"        "90"        "180_cGVHD"
##  [7] "30_cGVHD"  "60_cGVHD"  "90_cGVHD"  "cGVHD"
# Transform the dataframe
df_transformed <- df_clean %>%
  group_by(Subject_ID, Blood_test_day) %>% # Group by Subject and Day of Blood Test
  pivot_wider(
    id_cols = c(Subject_ID, Observation_Days_Count, cGVHD_Diagnosis_Day, cGVHD_flag, Blood_test_day), # Columns to retain as is
    names_from = Cell_name_unified, # New columns will be created from this column's values
    values_from = Cell_Count, # Populate new columns with values from this column
    values_fill = list(Cell_Count = NA) # Fill missing values with NA
  ) %>%
  ungroup()

# Check if in any cell contain more than 1 number
multi_value_cells <- apply(df_transformed, 2, function(column) {
  any(grepl(",", column, fixed = TRUE) | grepl(" ", column))
})

contains_multiple_values <- ifelse(any(multi_value_cells), "Yes", "No")
print(contains_multiple_values)
## [1] "Yes"
rm(unique_days, contains_multiple_values)

Crucial steps:

  1. Reshape the data, creating separate columns for each combination of Cell_name_unified and Blood_test_day. Missing values are filled with NA.
  2. Checks if any transformed cell contains multiple values
  3. Randomly selects a patient and compares their raw Cell_Count data.

1.2. Check for duplicates

df_transformed_without_duplicated <- df_transformed %>%
  select(-c(Blood_test_day, Observation_Days_Count, cGVHD_Diagnosis_Day, cGVHD_flag )) %>%
  distinct()

2. Descriptive Statistics

# Prepare df_transformed
df_transformed <- df_transformed %>%
  mutate(
    Subject_ID = as.factor(Subject_ID),
    cGVHD_flag = as.factor(cGVHD_flag)
  )

# List of descriptive statistics
statistics <- list(
  `Number of values` = ~as.character(sum(!is.na(.x))),
  `No data` = ~as.character(sum(is.na(.x))),
  `Mean` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(mean(.x, na.rm = TRUE) %>% round(2))),
  `Median` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(median(.x, na.rm = TRUE) %>% round(2))),
  `SD` = ~ifelse(sum(!is.na(.x)) < 3, "NA", as.character(sd(.x, na.rm = TRUE) %>% round(2))),
  `min` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(min(.x, na.rm = TRUE) %>% round(2))),
  `max` = ~ifelse(sum(!is.na(.x)) == 0, "NA", as.character(max(.x, na.rm = TRUE) %>% round(2)))
)

# Summarize the statistics for each numeric variable and make variables into columns
df_summary <- df_transformed %>%
  select(where(is.numeric)) %>%
  summarise(across(everything(), statistics, .names = "{.col}__{.fn}")) %>%
  pivot_longer(cols = everything(), names_sep = "__", names_to = c("Variable", "Stat"), values_to = "Value") %>%
  pivot_wider(names_from = Stat, values_from = Value) %>%
  slice(1:25)

# Create a flextable for the summary
flextable(df_summary) %>%
  autofit() %>%
  border_inner(border = fp_border(color = "black", width = 1)) %>%
  border_outer(border = fp_border(color = "black", width = 1))

Variable

Number of values

No data

Mean

Median

SD

min

max

_Number of values

_No data

_Mean

_Median

_SD

_min

_max

Observation_Days_Count

231

0

538.87

552

140.45

155

817

cGVHD_Diagnosis_Day

145

86

196.94

182

78.58

89

502

CD4_TFH

231

0

34.21

27.37

29.43

0

168.34

CD4_TH1

231

0

31.4

20.03

48.3

0.03

608.2

CD4_TH17

231

0

20.01

15.2

16.5

0.02

109.23

CD4_TH17TO1

231

0

5.93

3.75

6.51

0.03

39.95

CD4_TH2

231

0

20.79

14.48

22.47

0.02

143.45

CD4_TH22

231

0

7.42

4.91

7.82

0

43.52

CD4+_(IM STAT)

231

0

93.49

78.4

80.61

0

535.83

CD4+

231

0

110.47

91.29

91.72

0.01

561.4

CD4+_226+

231

0

243.48

199.65

173.16

0.31

1046.46

CD4+_39+

231

0

58.67

47.56

56.12

0.07

379.73

CD4+_DR+

231

0

107.49

80.2

111.56

0.05

960.16

CD4+_PD-1+

231

0

146.36

116.88

118.65

0.33

789.05

CD4+_PD-1+TIGIT-

231

0

79.22

64.8

62.39

0.28

335.51

CD4+_PD-1+TIGIT+

231

0

67.14

52.3

65.54

0.05

526.23

CD4+_PD-1-TIGIT-

231

0

104.29

76.47

91.53

0.03

537.94

CD4+_PD-1-TIGIT+

231

0

10.11

6.67

13.56

0

154.31

CD4+_TIGIT+

231

0

77.25

59.32

75.19

0.05

680.54

CD4_NV(STAR2)

231

0

49.66

30.32

60.85

0

510.77

CD4_NV

231

0

53.69

34.57

60.86

0

420.64

CD4_NV_TH1

231

0

3.4

1.74

5.05

0

31.18

CD4_NV_TH17

231

0

1.49

1.02

1.58

0

8.85

CD4_NV_TH17TO1

231

0

0.32

0.15

0.42

0

2.7

CD4_NV_TH2

231

0

6.02

3.46

8.19

0

57.11

# Clean and summarize the categorical data for all factor variables
df_transformed %>%
  select(-Subject_ID) %>%
  select(where(is.factor)) %>%
  pivot_longer(cols = everything(), names_to = "Variable", values_to = "Value") %>%
  group_by(Variable, Value) %>%
  summarise(n = n(), .groups = 'drop') %>%
  group_by(Variable) %>%
  mutate(`No data` = sum(is.na(Value)),
         `% by group` = (n / sum(n)) * 100) %>%
  ungroup() %>%
  select(Variable, Value, n, `% by group`, `No data`) %>%
  arrange(Variable, Value) %>%
  flextable() %>%
  merge_v("Variable") %>%
  autofit() %>%
  border_inner(border = fp_border(color = "black", width = 1)) %>%
  border_outer(border = fp_border(color = "black", width = 1))

Variable

Value

n

% by group

No data

cGVHD_flag

0

86

37.22944

0

1

145

62.77056

0

rm(df_summary, statistics)

The resulting table is too large and complex for effective understanding due to the number of numeric variables, but it highlights a high proportion of missing data (NA values).

3. Prediction of cGVHD outcome

3.1. Define DataSet

Survival Analysis are planned.

Event: Diagnosis of cGVHD (cGVHD_flag == 1).

Survival Time:

  • For patients with cGVHD: cGVHD_Diagnosis_Day as the survival time.
  • For patients without cGVHD: Observation_Days_Count as the censored survival time.

Perform PCA to reduce 167 variables into fewer components that explain the maximum variance!

# Select days of interest
df_cGVHD_prediction <- df_transformed %>%
  filter(Blood_test_day %in% c("90", "180", "365", "cGVHD")) 

# df_cGVHD_prediction <- df_cGVHD_prediction %>%
# # Apply log transformation to numeric columns
#  mutate(
#     Observation_Days_Count = factor(Observation_Days_Count),
#     cGVHD_Diagnosis_Day = factor(cGVHD_Diagnosis_Day)
#   ) %>%
#   mutate(across(
#     where(is.numeric), 
#     ~ log10(. + 1), 
#     .names = "{col}"
#   )) %>%
#   mutate(
#     Observation_Days_Count = as.numeric(Observation_Days_Count),
#     cGVHD_Diagnosis_Day = as.numeric(cGVHD_Diagnosis_Day)
#   )

Crucial steps:

  1. The dataset (df_cGVHD_prediction) includes immune cell measurements at specific time points (_30, _60, _90, _180, _365, and _cGVHD) while excluding those associated with timing after chronic GVHD (e.g., _90_cGVHD).

3.2. Check for Missing Data

# Calculate missing values by Blood_test_day
missing_summary <- df_cGVHD_prediction %>%
  group_by(Blood_test_day) %>%
  summarise(across(everything(), ~ mean(is.na(.)) * 100)) %>%
  pivot_longer(cols = -Blood_test_day, names_to = "Column", values_to = "Missing_Percentage")

Crucial steps:

  1. No missing values by Blood test day were identified for any cell name.

3.3. Visualisation

3.3.1. Correlation martrix

# Select relevant numeric columns 
correlation_data <- df_cGVHD_prediction %>%
  filter(Blood_test_day %in% c("90")) %>%
  select(where(is.numeric))

# Compute correlation matrix
cor_matrix <- cor(correlation_data, use = "complete.obs")

# Mask weak correlations
cor_matrix[abs(cor_matrix) < 0.5] <- NA

# Plot only strong correlations
corrplot(cor_matrix, 
         method = "color", 
         type = "lower", 
         tl.col = "black", 
         tl.srt = 45, 
         number.cex = 0.1, 
         addCoef.col = "black", 
         mar = c(2, 2, 2, 2),
         cl.cex = 0.5,        
         tl.cex = 0.15)

# Create a table with corellation >= 0.9 as Variable1 variable 2 correlation coefficient
# Correlation matrix into a long format
correlation_table <- as.data.frame(as.table(cor_matrix))

# Rename the columns
colnames(correlation_table) <- c("Variable1", "Variable2", "Correlation")

# Filter for correlations >= 0.9
filtered_table <- correlation_table %>%
  filter(Correlation >= 0.9 & Variable1 != Variable2)

# Ensure no duplicate pairs by sorting Variable1 and Variable2
filtered_table <- filtered_table %>%
  mutate(
    Pair = paste0(pmin(Variable1, Variable2), "-", pmax(Variable1, Variable2))) %>%
  distinct(Pair, .keep_all = TRUE) %>%
  select(-Pair) %>%
  arrange(desc(Correlation)) %>%
  mutate(Num = row_number())

print(filtered_table)
##                Variable1          Variable2 Correlation Num
## 1     CD4_NV+PD-1-TIGIT-      CD4_NV(STAR2)   0.9988338   1
## 2               CD4_ЕМTM        CD4_ЕМ+226+   0.9986997   2
## 3         CD8+_(IM STAT)              CD8+_   0.9980477   3
## 4            CD4_СМ+226+      CD4_СМ(STAR2)   0.9980257   4
## 5            CD4_ТЕ+226+      CD4_ТЕ(STAR2)   0.9978914   5
## 6          CD4_ТЕ(STAR2)        CD4_ТЕ+226+   0.9978914   6
## 7               CD8+_DR+     CD8+_(IM STAT)   0.9969734   7
## 8              CD8+_226+     CD8+_(IM STAT)   0.9967175   8
## 9            CD8_NV+226+      CD8_NV(STAR2)   0.9957302   9
## 10           CD4+_TIGIT+   CD4+_PD-1+TIGIT+   0.9951726  10
## 11      CD4+_PD-1+TIGIT+        CD4+_TIGIT+   0.9951726  11
## 12           CD4_NV+226+      CD4_NV(STAR2)   0.9927777  12
## 13                CD4_NV      CD4_NV(STAR2)   0.9910998  13
## 14         CD4_NV(STAR2)             CD4_NV   0.9910998  14
## 15         CD4_ТЕ+TIGIT+ CD4_ТЕ+PD-1+TIGIT+   0.9896745  15
## 16           CD8_СМ+226+      CD8_СМ(STAR2)   0.9894197  16
## 17         CD8_СМ(STAR2)        CD8_СМ+226+   0.9894197  17
## 18            CD4_ТM_TH1            CD4_TH1   0.9862180  18
## 19               CD4_TH1         CD4_ТM_TH1   0.9862180  19
## 20        CD4_ТM_TH17TO1        CD4_TH17TO1   0.9849283  20
## 21           CD4_TH17TO1     CD4_ТM_TH17TO1   0.9849283  21
## 22    CD8_NV+PD-1-TIGIT-      CD8_NV(STAR2)   0.9847227  22
## 23            CD8_ТЕ+DR+     CD8+_(IM STAT)   0.9824943  23
## 24      CD8+_PD-1-TIGIT+     CD8+_(IM STAT)   0.9775403  24
## 25    CD4_ЕМ+PD-1+TIGIT-   CD4+_PD-1+TIGIT-   0.9770290  25
## 26  CD8_EMTM+PD-1+TIGIT+     CD8+_(IM STAT)   0.9755977  26
## 27           CD4_ТM_TH22           CD4_TH22   0.9754262  27
## 28              CD4_TH22        CD4_ТM_TH22   0.9754262  28
## 29           CD8_ТЕ+226+     CD8+_(IM STAT)   0.9754144  29
## 30         CD8_ТЕ(STAR2)     CD8+_(IM STAT)   0.9734285  30
## 31  CD8_EMTM+PD-1-TIGIT-         CD4_ЕМ_TH1   0.9717046  31
## 32               TREG_CM           CD4_ТREG   0.9710022  32
## 33              CD4_ТREG            TREG_CM   0.9710022  33
## 34           CD8+_TIGIT+     CD8+_(IM STAT)   0.9705005  34
## 35    CD8_ТЕ+PD-1-TIGIT+     CD8+_(IM STAT)   0.9690826  35
## 36            CD4_ЕМ+DR+           CD4+_DR+   0.9679522  36
## 37            CD8+_PD-1+     CD8+_(IM STAT)   0.9661445  37
## 38       TREG+226-TIGIT+           CD4_ТREG   0.9654008  38
## 39              TREG+39+           CD4_ТREG   0.9634220  39
## 40         CD8_СМ+TIGIT+      CD8_СМ(STAR2)   0.9627868  40
## 41              TREG+DR+           CD4_ТREG   0.9613748  41
## 42           TREG+TIGIT+           CD4_ТREG   0.9606648  42
## 43    CD8_NV+PD-1+TIGIT+       CD8_NV+PD-1+   0.9592687  43
## 44          CD8_NV+PD-1+ CD8_NV+PD-1+TIGIT+   0.9592687  44
## 45            CD8_СМ+DR+      CD8_СМ(STAR2)   0.9575376  45
## 46          CD8_EMTM+39+         CD4_ЕМ_TH1   0.9568416  46
## 47            CD4+_PD-1+           CD4+_DR+   0.9565507  47
## 48              CD4+_DR+         CD4+_PD-1+   0.9565507  48
## 49             TREG_EMTM           CD4_ТREG   0.9564104  49
## 50            TREG+PD-1+           CD4_ТREG   0.9559641  50
## 51            CD4_ЕМ+39+           CD4+_39+   0.9540809  51
## 52              CD4+_39+         CD4_ЕМ+39+   0.9540809  52
## 53  CD8_EMTM+PD-1+TIGIT-         CD4_ЕМ_TH1   0.9529874  53
## 54      CD8+_PD-1+TIGIT-     CD8+_(IM STAT)   0.9528599  54
## 55                CD8_ТЕ     CD8+_(IM STAT)   0.9497343  55
## 56                CD4_ТМ        CD4_ЕМ+226+   0.9494617  56
## 57    CD8_NV+PD-1-TIGIT+      CD8_NV+TIGIT+   0.9481596  57
## 58         CD8_EMTM+226+         CD4_ЕМ_TH1   0.9462705  58
## 59         CD4_СМ+TIGIT+       CD4_СМ+PD-1+   0.9456306  59
## 60          CD8_EMTM+DR+         CD4_ЕМ_TH1   0.9448519  60
## 61                CD8_ЕМ         CD4_ЕМ_TH1   0.9428652  61
## 62    CD4_ТЕ+PD-1+TIGIT-            CD4_TH1   0.9422821  62
## 63      CD8+_PD-1-TIGIT-         CD4_ЕМ_TH1   0.9418677  63
## 64         CD8_NV+TIGIT+       CD8_NV+PD-1+   0.9410303  64
## 65    CD8_ТЕ+PD-1+TIGIT-   CD8+_PD-1+TIGIT-   0.9408341  65
## 66              CD8_EMTM         CD4_ЕМ_TH1   0.9400801  66
## 67    CD4_NV+PD-1+TIGIT-       CD4_NV+PD-1+   0.9396690  67
## 68          CD4_NV+PD-1+ CD4_NV+PD-1+TIGIT-   0.9396690  68
## 69                 CD4+_     CD4+_(IM STAT)   0.9382052  69
## 70        CD4+_(IM STAT)              CD4+_   0.9382052  70
## 71                CD4_СМ      CD4_СМ(STAR2)   0.9374218  71
## 72         CD4_СМ(STAR2)             CD4_СМ   0.9374218  72
## 73                CD8_ТМ     CD8+_(IM STAT)   0.9362897  73
## 74  CD8_EMTM+PD-1-TIGIT+         CD4_ЕМ_TH1   0.9360448  74
## 75       CD4_ТREG(STAR2)     CD4+_(IM STAT)   0.9354179  75
## 76    CD8_ТЕ+PD-1-TIGIT-         CD4_ЕМ_TH1   0.9350791  76
## 77    CD8_ТЕ+PD-1+TIGIT+         CD8+_PD-1+   0.9344868  77
## 78         CD8_ТЕ+TIGIT+     CD8+_(IM STAT)   0.9344091  78
## 79            CD8_СМ+39+      CD8_СМ(STAR2)   0.9337067  79
## 80       TREG+226+TIGIT+           CD4_ТREG   0.9336296  80
## 81          CD4_ТREG_TH2          TREG_EMTM   0.9329874  81
## 82      CD8+_PD-1+TIGIT+     CD8+_(IM STAT)   0.9310397  82
## 83           CD4_СМ_TH17           CD4_TH17   0.9275637  83
## 84              CD4_TH17        CD4_СМ_TH17   0.9275637  84
## 85    CD4_ЕМ+PD-1+TIGIT+   CD4+_PD-1+TIGIT+   0.9258802  85
## 86            CD8_ТЕ+39+         CD4_ТЕ+39+   0.9243269  86
## 87          CD4_ЕМ+PD-1+           CD4+_DR+   0.9225987  87
## 88         CD4_ТREG_TH17          TREG+226+   0.9210647  88
## 89        CD8_EMTM+PD-1+         CD4_ЕМ_TH1   0.9182152  89
## 90         CD4_ЕМ+TIGIT+   CD4+_PD-1+TIGIT+   0.9163401  90
## 91          CD8_СМ+PD-1+      CD8_СМ(STAR2)   0.9155320  91
## 92            CD8_NV+DR+   CD8+_PD-1-TIGIT+   0.9150977  92
## 93                CD8_СМ         CD4_ЕМ_TH1   0.9143635  93
## 94              CD8_TREG         CD4_ЕМ_TH1   0.9143576  94
## 95            CD4_ТЕ+DR+            CD4_TH1   0.9142354  95
## 96                 CD8+_         CD4_ЕМ_TH1   0.9141853  96
## 97                CD4_ТЕ             CD4_ЕМ   0.9109908  97
## 98                CD4_ЕМ             CD4_ТЕ   0.9109908  98
## 99              CD8+_39+         CD4_ЕМ_TH1   0.9103280  99
## 100          CD4_ЕМ+226+         CD4_ЕМ+DR+   0.9096396 100
## 101         CD4_СМ+PD-1+         CD4_СМ+DR+   0.9084474 101
## 102           CD4_СМ+DR+       CD4_СМ+PD-1+   0.9084474 102
## 103   CD4_СМ+PD-1+TIGIT+         CD4_СМ+DR+   0.9080798 103
## 104           CD4_ТЕ+39+         CD4_ТЕ+DR+   0.9076214 104
## 105   CD4_ТЕ+PD-1+TIGIT+       CD4_ТЕ+PD-1+   0.9071497 105
## 106        CD4_NV+TIGIT+ CD4_NV+PD-1-TIGIT+   0.9063595 106
## 107   CD4_NV+PD-1-TIGIT+      CD4_NV+TIGIT+   0.9063595 107
## 108            TREG+226+           CD4_ТREG   0.9059511 108
## 109         CD8_ТЕ+PD-1+ CD4_ТЕ+PD-1+TIGIT+   0.9045578 109
## 110         CD4_ТЕ+PD-1+           CD4+_DR+   0.9042370 110
## 111   CD8_СМ+PD-1+TIGIT+      CD8_СМ(STAR2)   0.9031949 111
## 112           CD4_ТЕ_TH1         CD4_ЕМ_TH1   0.9030272 112
## 113           CD4_ЕМ_TH1         CD4_ТЕ_TH1   0.9030272 113
## 114     CD4+_PD-1+TIGIT-           CD4+_DR+   0.9027076 114
## 115               CD8_NV      CD8_NV(STAR2)   0.9014977 115
## 116        CD8_NV(STAR2)             CD8_NV   0.9014977 116
## 117      CD8_EMTM+TIGIT+         CD4_ЕМ_TH1   0.9006824 117
## 118   CD8_СМ+PD-1+TIGIT-        CD8_СМ+226+   0.9003490 118
# Check correlations for 'cGVHD_Diagnosis_Day' greater than 0.6
cgvhd_filtered_table <- correlation_table %>%
  filter(
    (Variable1 == "cGVHD_Diagnosis_Day") &
    Correlation > 0.6 & 
    (Variable1 != Variable2)
  ) %>%
  arrange(desc(Correlation)) %>%
  mutate(Num = row_number())

# Print the filtered table
print(cgvhd_filtered_table)
##             Variable1       Variable2 Correlation Num
## 1 cGVHD_Diagnosis_Day        CD4_ТREG   0.7115962   1
## 2 cGVHD_Diagnosis_Day         TREG_CM   0.6923027   2
## 3 cGVHD_Diagnosis_Day TREG+226-TIGIT+   0.6656244   3
## 4 cGVHD_Diagnosis_Day     CD4_ТЕ_TH17   0.6649963   4
## 5 cGVHD_Diagnosis_Day        TREG+39+   0.6364174   5
## 6 cGVHD_Diagnosis_Day       TREG_EMTM   0.6356658   6
## 7 cGVHD_Diagnosis_Day     TREG+TIGIT+   0.6186489   7
## 8 cGVHD_Diagnosis_Day        TREG+DR+   0.6134311   8
## 9 cGVHD_Diagnosis_Day      TREG+PD-1+   0.6018802   9
  • In linear regression or logistic regression, multicollinearity makes it difficult to determine the unique contribution of each predictor variable.

Models that are not sensitive to multicollinearity because they split the data hierarchically:

  • Random Forest:
  • XGBoost

3.3.2. Heatmap with clusters

# Select relevant numeric columns 
correlation_data <- df_cGVHD_prediction %>%
  filter(Blood_test_day %in% c("90")) %>%
  select(where(is.numeric), -cGVHD_Diagnosis_Day)

# Compute correlation matrix
cor_matrix <- cor(correlation_data, use = "complete.obs")

# Hierarchical clustering and heatmap
pheatmap(cor_matrix,
         cluster_rows = TRUE,
         cluster_cols = TRUE,
         display_numbers = FALSE,
         fontsize_row = 2,
         fontsize_col = 2,
         main = "Clustered Heatmap for Survival Variables",
         width = 20, 
        height = 20  )

# Perform hierarchical clustering
# Convert correlation to distance
distance_matrix <- as.dist(1 - cor_matrix) 
hclust_result <- hclust(distance_matrix, method = "ward.D2") # Ward's method

# Cut the dendrogram into clusters (replace 5 with your desired number of clusters)
clusters <- cutree(hclust_result, k = 6) # Extract 5 clusters

# Create a data frame to associate variables with their clusters
clustered_data <- data.frame(
  Variable = rownames(cor_matrix),
  Cluster = clusters
)

# Dendrogram with clusters
plot(hclust_result, labels = rownames(cor_matrix), main = "Dendrogram of Clusters", cex = 0.2)

# Add rectangles to highlight clusters
rect.hclust(hclust_result, k = 5, border = 2:6)

custom_order <- c("90", "180", "365", "cGVHD")

# Convert Blood_test_day to a factor with the custom order
df_cGVHD_transformed <- df_cGVHD_prediction %>%
  mutate(Blood_test_day = factor(Blood_test_day, levels = custom_order))

# Arrange data by the specified order of Blood_test_day
df_cGVHD_transformed <- df_cGVHD_transformed %>%
  arrange(Blood_test_day)

# Aggregate data by Blood_test_day
heatmap_data <- df_cGVHD_transformed %>%
  group_by(Blood_test_day) %>%
  summarise(across(where(is.numeric), mean, na.rm = TRUE)) %>%
  ungroup()

# Convert to matrix for heatmap plotting
heatmap_matrix <- heatmap_data %>%
  column_to_rownames("Blood_test_day") %>%
  as.matrix()

# Create the heatmap
pheatmap(heatmap_matrix,
         cluster_rows = FALSE,
         cluster_cols = TRUE,
         display_numbers = FALSE,
         fontsize_row = 6,
         fontsize_col = 3,
         main = "Heatmap: Variable Distribution by Blood Test Day")

Some variable distributions seem to change over time (Blood Test Day), with certain groups (e.g., those in warmer tones) showing higher activity at specific days, such as 365 (for non-log-transformed data).

3.3.3. PCA

# Filter and normalize data while preserving cGVHD_flag
df_normalized <- df_transformed %>%
  filter(Blood_test_day %in% c("90", "180", "365", "cGVHD")) %>%
  # Ensure cGVHD_flag is retained for later use
  select(cGVHD_flag, where(is.numeric), -Observation_Days_Count, -cGVHD_Diagnosis_Day) %>%
  mutate(across(where(is.numeric), ~ (.-mean(.)) / sd(.), .names = "z_{col}"))

# Save cGVHD_flag for later use
cGVHD_flags <- df_normalized$cGVHD_flag

# Perform near-zero variance filtering (excluding cGVHD_flag)
nzv <- nearZeroVar(df_normalized %>% select(-cGVHD_flag))
df_filtered <- df_normalized %>%
  select(-nzv, cGVHD_flag) %>% # Retain cGVHD_flag after filtering
  filter(complete.cases(.))

# Perform PCA (exclude cGVHD_flag from PCA computation)
pca_results <- prcomp(df_filtered %>% select(-cGVHD_flag), scale. = TRUE)

# Attach PCA scores and cGVHD_flag back together
pca_scores <- as.data.frame(pca_results$x) %>%
  mutate(cGVHD_flag = cGVHD_flags)

# Explained variance calculations
explained_variance <- pca_results$sdev^2 / sum(pca_results$sdev^2)
cumulative_variance <- cumsum(explained_variance)

# Create a data frame for plotting explained variance
explained_variance_data <- data.frame(
  PC = 1:length(explained_variance),
  Explained_Variance = explained_variance,
  Cumulative_Variance = cumulative_variance
)

# Filter for components contributing to the first 80% of cumulative variance
explained_variance_data_filtered <- explained_variance_data %>%
  filter(Cumulative_Variance <= 0.8)

# Add percentage labels for explained variance
explained_variance_data_filtered <- explained_variance_data_filtered %>%
  mutate(Variance_Percentage_Label = paste0(round(Explained_Variance * 100, 2), "%"))

# Plot explained variance with percentage labels
ggplot(explained_variance_data_filtered, aes(x = PC)) +
  geom_bar(aes(y = Explained_Variance), stat = "identity", fill = "steelblue") +
  geom_text(aes(y = Explained_Variance, label = Variance_Percentage_Label), 
            vjust = -0.5, size = 3.5) +
  geom_line(aes(y = Cumulative_Variance), color = "red", size = 1) +
  geom_point(aes(y = Cumulative_Variance), color = "red", size = 2) +
  scale_y_continuous(
    name = "Variance Explained",
    sec.axis = sec_axis(~., name = "Cumulative Variance Explained")
  ) +
  labs(
    title = "Explained Variance by Principal Components (First 80%)",
    x = "Principal Component"
  ) +
  theme_minimal(base_size = 14)

# Perform clustering on PCA scores
set.seed(123)
wss <- sapply(1:10, function(k) {
  kmeans(pca_scores[, 1:10], centers = k, nstart = 25)$tot.withinss
})

# Plot Elbow Method
elbow_plot <- data.frame(Clusters = 1:10, WSS = wss)
ggplot(elbow_plot, aes(x = Clusters, y = WSS)) +
  geom_line() +
  geom_point(size = 3) +
  labs(
    title = "Elbow Method for Optimal Clusters",
    x = "Number of Clusters",
    y = "Total Within-Cluster Sum of Squares (WSS)"
  ) +
  theme_minimal(base_size = 14)

# Apply K-means clustering
optimal_clusters <- 4
kmeans_result <- kmeans(pca_scores[, 1:10], centers = optimal_clusters, nstart = 25)
pca_scores$Cluster <- as.factor(kmeans_result$cluster)

# Visualize clusters with coloring by cGVHD_flag
ggplot(pca_scores, aes(x = PC1, y = PC2, color = as.factor(cGVHD_flag))) +
  geom_point(size = 2, alpha = 0.8) +  # Scatterplot of PCA points
  stat_ellipse(aes(group = Cluster), type = "t", linetype = "dashed", size = 1, color = "black") +  # Ellipses for clusters
  scale_color_brewer(palette = "Set1") +  # Color palette for cGVHD_flag
  labs(
    title = "PCA Clusters with cGVHD_flag Coloring and Cluster Ellipses",
    x = "PC1 (Principal Component 1)",
    y = "PC2 (Principal Component 2)",
    color = "cGVHD_flag"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )
## Too few points to calculate an ellipse

3.3.4. BORUTA algorithm

# Load Boruta package
library(Boruta)

df_normalized <- df_transformed %>%
  filter(Blood_test_day %in% c("90", "180", "365", "cGVHD")) %>%
  select(where(is.numeric), -Observation_Days_Count, -cGVHD_Diagnosis_Day)

nzv <- nearZeroVar(df_normalized) 
df_filtered <- df_normalized[, -nzv] # near-zero variance filtering

df_filtered <- df_normalized %>%
  filter(complete.cases(.)) # Delete NA

df_filtered$cGVHD_flag <- as.factor(df_cGVHD_prediction$cGVHD_flag)

# Apply the Boruta algorithm for feature selection
boruta_result <- Boruta(cGVHD_flag ~ ., data = df_filtered)

final_decision <- attStats(boruta_result)
print(final_decision)
##                           meanImp    medianImp       minImp    maxImp
## CD4_TFH               0.275937126  0.363346656 -1.373304148 2.2045750
## CD4_TH1               0.371346040  0.505538752 -0.597288351 1.7062409
## CD4_TH17              0.954583268  0.911847089 -0.274445066 2.3211816
## CD4_TH17TO1           0.318146704  0.300188508 -1.038701171 2.0288875
## CD4_TH2               0.964668073  1.043783003 -0.959916111 2.5666241
## CD4_TH22              3.078155557  3.089281887 -0.893501999 5.3328783
## CD4+_(IM STAT)        5.399603582  5.670148763  1.856439134 7.4640394
## CD4+_                 4.323622392  4.526130957  0.880961813 6.6238344
## CD4+_226+             3.260640034  3.487806941 -0.224825581 5.7043893
## CD4+_39+              0.889326587  1.317629337 -2.155066507 2.5682610
## CD4+_DR+              0.401756149  0.380293275 -1.468688699 1.9918155
## CD4+_PD-1+            1.424442622  1.622104770 -0.288787886 2.8361988
## CD4+_PD-1+TIGIT-      0.765410783  0.720661346 -0.806574946 2.1343221
## CD4+_PD-1+TIGIT+      0.639586213  0.510466926 -0.999191981 1.9270155
## CD4+_PD-1-TIGIT-      0.170480823  0.235217125 -1.326237118 2.1653324
## CD4+_PD-1-TIGIT+      0.341183177  0.376714784 -1.157962320 1.9185238
## CD4+_TIGIT+           0.965902316  0.910284371 -0.783330140 2.0542812
## CD4_NV(STAR2)         1.048050974  1.392850096 -1.535950567 2.6391562
## CD4_NV                1.055530395  1.159838566 -0.767799105 2.4424385
## CD4_NV_TH1            1.460189943  1.433241451 -0.699308415 4.1650816
## CD4_NV_TH17          -0.456175317 -0.101836264 -2.536597757 0.7835719
## CD4_NV_TH17TO1        1.176414946  1.021924886  0.192406437 2.3896965
## CD4_NV_TH2            0.305547257  0.162967488 -1.007725134 2.4910715
## CD4_NV_TH22           0.524241445  0.542678857 -0.896401103 1.8755963
## CD4_NV+226+           1.186467866  1.421094029 -0.814809588 2.7325232
## CD4_NV+39+            0.816701613  1.008122887 -1.503421339 2.4853659
## CD4_NV+DR+           -0.303138152 -0.459627699 -1.583938505 2.0959799
## CD4_NV+PD-1+          1.211566047  1.105250735 -1.401405853 3.2577951
## CD4_NV+PD-1+TIGIT-    0.408598443  0.415829048 -1.529148202 1.9143535
## CD4_NV+PD-1+TIGIT+    4.614293992  4.875804653  1.558003660 6.4816540
## CD4_NV+PD-1-TIGIT-    1.154487236  1.218659233 -0.164363022 2.4240605
## CD4_NV+PD-1-TIGIT+    0.957961906  1.186988904 -1.177357774 2.4579466
## CD4_NV+TIGIT+         3.206585735  3.395910125 -0.086492946 5.3710123
## CD4_ЕМ                1.294249975  1.443855539 -0.111568456 3.0580621
## CD4_ЕМ_TH1            4.425039085  4.522884870  0.715454936 6.7525340
## CD4_ЕМ_TH17           0.862665350  0.678974214 -0.051625343 2.1812339
## CD4_ЕМ_TH17TO1        7.277045876  7.590777300  2.817118145 9.7254706
## CD4_ЕМ_TH2            0.761870675  0.700256440 -0.564301649 2.3565401
## CD4_ЕМ_TH22           0.311942033  0.182454742 -1.584506954 1.8604464
## CD4_ЕМ+226+           1.543462670  1.587407351 -0.232483248 2.8254162
## CD4_ЕМ+39+            0.490873221  0.553009928 -2.322914198 2.1131446
## CD4_ЕМ+DR+            0.763199435  0.795206599 -0.242309562 1.8435584
## CD4_ЕМ+PD-1+          1.108661312  1.045366721 -0.587789935 2.1740283
## CD4_ЕМ+PD-1+TIGIT-    0.753533613  0.523980025 -0.594501837 2.2679027
## CD4_ЕМ+PD-1+TIGIT+    0.234784280  0.059973340 -0.926551321 1.7468386
## CD4_ЕМ+PD-1-TIGIT-    0.320463808  0.202897662 -1.585831542 1.9067665
## CD4_ЕМ+PD-1-TIGIT+   -0.171482584 -0.004365224 -2.487076483 1.1934384
## CD4_ЕМ+TIGIT+         0.313357674  0.206748881 -0.790478815 1.6519178
## CD4_ЕМTM              0.972442004  0.983866929 -0.693934252 2.5688909
## CD4_СМ(STAR2)         0.495014014  0.692458516 -1.545233541 2.0400186
## CD4_СМ                1.241441723  1.196864479 -0.080049060 2.3574531
## CD4_СМ_TH1            3.036100840  3.075430409  0.713682863 5.1344505
## CD4_СМ_TH17           2.684795946  2.749483043 -0.652442251 4.5181968
## CD4_СМ_TH17TO1        1.122030943  1.181403891 -0.164238168 2.2762473
## CD4_СМ_TH2            4.195865506  4.300009084  1.835914028 6.4573984
## CD4_СМ_TH22           6.324460886  6.578230411  2.902173827 8.4741417
## CD4_СМ+226+           0.611386315  0.526815767 -0.585970381 2.0100202
## CD4_СМ+39+            0.446946101  0.220428622 -1.709526057 1.9717870
## CD4_СМ+DR+            0.230021370  0.386396612 -2.035978470 1.8778436
## CD4_СМ+PD-1+         -0.073391310  0.066570761 -2.418947140 1.8099610
## CD4_СМ+PD-1+TIGIT-    0.114440809  0.067156146 -1.979122016 1.3230956
## CD4_СМ+PD-1+TIGIT+    0.306397669  0.440762455 -1.359269499 1.5219055
## CD4_СМ+PD-1-TIGIT-    1.629357911  1.767858821  0.419367421 2.5323225
## CD4_СМ+PD-1-TIGIT+   -0.339016155 -0.599559338 -1.976389362 1.8409060
## CD4_СМ+TIGIT+         1.285639667  1.153823947  0.175817586 2.6885042
## CD4_ТM_TH1            0.285572512  0.464849558 -1.259993702 1.2872835
## CD4_ТM_TH17           0.292847496  0.304753088 -1.688745028 1.8010805
## CD4_ТM_TH17TO1        0.186263363  0.289384980 -1.914947952 2.0064351
## CD4_ТM_TH2            1.532194388  1.787953944 -1.806899825 3.7786427
## CD4_ТM_TH22           1.514777463  1.660335982 -0.504604612 3.1377345
## CD4_ТREG(STAR2)       5.460464061  5.689751384  1.756057874 7.3049698
## CD4_ТREG              1.265972506  1.221614144 -0.348575251 3.0156694
## CD4_ТREG_TH1          0.247627280  0.287753203 -1.589943994 2.2532309
## CD4_ТREG_TH17         3.022393839  3.122630236 -0.278669754 5.0950106
## CD4_ТREG_TH17TO1      0.255921491  0.377248439 -1.322269270 1.9335020
## CD4_ТREG_TH2          1.105196155  1.123717180 -0.933057646 3.2852182
## CD4_ТREG_TH22         0.987104619  1.005433096 -0.220064145 2.0204995
## CD4_ТЕ(STAR2)        -0.009239855 -0.046758290 -1.634244762 2.1583041
## CD4_ТЕ                0.081877250  0.083742266 -1.402301564 0.9325371
## CD4_ТЕ_TH1            0.963556997  0.956837277 -1.484088657 3.0604074
## CD4_ТЕ_TH17          -0.142582286 -0.222013362 -1.145531133 1.5803324
## CD4_ТЕ_TH17TO1        0.357180911  0.209989219 -0.819992371 1.5535715
## CD4_ТЕ_TH2           -0.037146397 -0.121007715 -1.576568762 1.9996748
## CD4_ТЕ_TH22          -0.378647968 -0.391869623 -1.910511822 1.6400068
## CD4_ТЕ+226+          -0.084977465  0.029119843 -1.779651338 1.9701350
## CD4_ТЕ+39+            0.225154179  0.242366746 -1.393640860 2.5706828
## CD4_ТЕ+DR+            0.338128404  0.280995104 -1.423633571 2.1610768
## CD4_ТЕ+PD-1+         -0.337577700 -0.212831518 -1.902984590 1.3309814
## CD4_ТЕ+PD-1+TIGIT-   -0.086482143  0.101183134 -1.440889287 1.4999073
## CD4_ТЕ+PD-1+TIGIT+   -0.253567867 -0.432656154 -2.025138072 1.4108459
## CD4_ТЕ+PD-1-TIGIT-    0.446631026  0.327430980 -0.864384179 1.7955601
## CD4_ТЕ+PD-1-TIGIT+    0.477131422  0.479091653 -0.880232336 2.2052402
## CD4_ТЕ+TIGIT+        -0.208655903 -0.492259251 -2.270875739 1.8804063
## CD4_ТМ                3.118855993  3.289358142 -0.604273718 6.1176763
## CD8+_(IM STAT)        1.245543351  1.134891343 -0.502711411 2.9189639
## CD8+_                 4.006916369  4.210659672  0.841928294 6.5756355
## CD8+_226+             2.483765013  2.428979471 -0.808693052 5.2130788
## CD8+_39+             -0.379063321 -0.155133723 -2.535763344 1.0747775
## CD8+_DR+              1.988255602  2.097422946 -0.793886512 3.8046116
## CD8+_PD-1+            0.869200605  1.123658801 -1.585076394 2.9229171
## CD8+_PD-1+TIGIT-      0.590896132  0.820991236 -1.915853491 2.7986601
## CD8+_PD-1+TIGIT+      1.510523501  1.777858532 -0.932663694 2.6355907
## CD8+_PD-1-TIGIT-      0.316711217  0.470918180 -1.569133034 1.8785878
## CD8+_PD-1-TIGIT+      1.075033040  1.120514102 -1.010299501 2.6145925
## CD8+_TIGIT+           0.793815895  0.639361358 -1.294132354 2.0755349
## CD8_EMTM              3.511573033  3.590030415  0.957769869 6.8932173
## CD8_EMTM+226+         3.024237797  2.993001452  0.310661511 6.7024385
## CD8_EMTM+39+          0.466648592  0.474180799 -0.531980283 1.9855748
## CD8_EMTM+DR+          4.300386653  4.261313283  1.581026844 6.9364673
## CD8_EMTM+PD-1+        3.775876235  3.749905232  0.902595568 6.1062808
## CD8_EMTM+PD-1+TIGIT-  0.380158011  0.422706828 -2.296118333 1.7327369
## CD8_EMTM+PD-1+TIGIT+  3.589005859  3.582403241  0.368144410 6.5470238
## CD8_EMTM+PD-1-TIGIT-  1.928907557  1.862232666 -0.108248889 4.3267920
## CD8_EMTM+PD-1-TIGIT+  3.104211759  3.256385791 -0.230726558 5.0558569
## CD8_EMTM+TIGIT+       4.114975055  4.155144619  0.095649822 6.3890315
## CD8_NV(STAR2)         0.872511346  0.709799261 -0.932825493 3.3741478
## CD8_NV                0.793118171  0.542076392 -0.511067939 2.1592329
## CD8_NV+226+           0.613828241  0.937545955 -1.569659968 3.5044567
## CD8_NV+39+            0.478580415  0.579555459 -1.699584701 2.9321011
## CD8_NV+DR+            0.606439175  0.907015036 -0.526150552 1.6725525
## CD8_NV+PD-1+          0.931116648  0.914617239 -0.658030101 2.6969884
## CD8_NV+PD-1+TIGIT-   -0.077075230 -0.222199100 -1.509486405 2.1573345
## CD8_NV+PD-1+TIGIT+    0.519759990  0.445458950 -1.338224173 2.0992661
## CD8_NV+PD-1-TIGIT-    0.138326813  0.083642446 -1.274857745 1.6288564
## CD8_NV+PD-1-TIGIT+   -0.764273454 -0.658517969 -2.503523458 0.8065249
## CD8_NV+TIGIT+        -0.625253732 -0.600982393 -2.182651735 1.3664447
## CD8_TREG              4.573143874  4.713150088  1.264727338 7.1971468
## CD8_ЕМ                0.401006572  0.456698286 -1.604618625 2.5387477
## CD8_СМ(STAR2)         2.229309764  2.348587868 -0.774728333 4.1125219
## CD8_СМ                0.476739113  0.382370660 -0.855857856 2.0206210
## CD8_СМ+226+           0.992989548  0.930151474 -1.307638320 2.4558743
## CD8_СМ+39+            0.286903328  0.097155735 -1.778007951 2.1548406
## CD8_СМ+DR+            0.958350232  1.185277829 -0.218306476 2.6159870
## CD8_СМ+PD-1+          0.483315447  0.344323232 -1.613799766 2.3051279
## CD8_СМ+PD-1+TIGIT-    0.232948932  0.307150442 -1.165483388 1.2196431
## CD8_СМ+PD-1+TIGIT+    0.390676863  0.478643468 -1.076552015 1.4154909
## CD8_СМ+PD-1-TIGIT-   -0.132923436 -0.383993158 -1.087359733 1.1442626
## CD8_СМ+PD-1-TIGIT+    0.170795339  0.331203802 -2.146338441 1.4848629
## CD8_СМ+TIGIT+         1.262250296  1.219404311 -0.670622528 2.9762162
## CD8_ТЕ(STAR2)         1.140118583  1.189347883 -0.765343890 2.4204819
## CD8_ТЕ                2.744561912  2.805599848 -0.055792162 4.6806426
## CD8_ТЕ+226+           0.756570542  0.788621745 -0.635056509 2.4760316
## CD8_ТЕ+39+           -0.491470208 -0.586552278 -1.652677715 0.6791737
## CD8_ТЕ+DR+            0.890659976  0.491023615 -0.541953142 2.8469106
## CD8_ТЕ+PD-1+          0.220095471  0.020839703 -1.105064261 2.1362759
## CD8_ТЕ+PD-1+TIGIT-    0.087018691  0.022795045 -1.880146612 1.9684677
## CD8_ТЕ+PD-1+TIGIT+    0.435369019  0.580735233 -1.893484410 1.7130266
## CD8_ТЕ+PD-1-TIGIT-    0.899762325  0.733538690 -0.448767627 2.4188596
## CD8_ТЕ+PD-1-TIGIT+    1.067246563  1.168130080 -0.863593005 1.9096426
## CD8_ТЕ+TIGIT+         1.372222386  1.177453128 -1.079961512 2.9142020
## CD8_ТМ                1.234091031  1.110640280 -0.027192845 2.5175713
## TREG_CM               0.191214788  0.211638335 -1.091895070 1.8863218
## TREG_EMTM             2.439858040  2.442110955 -0.114936504 5.3648939
## TREG_NV               0.356283485  0.582730263 -1.311403623 1.4648789
## TREG_TE               4.246379656  4.282526372  1.885810026 6.6914198
## TREG+226+             1.442358303  1.751763113 -0.674603118 2.7704962
## TREG+226+TIGIT-      -0.015635124  0.006790233 -1.558522544 1.6397693
## TREG+226+TIGIT+       1.715041587  1.766836755 -0.002211773 3.9427960
## TREG+226-TIGIT-       0.755989274  0.638027510 -0.246772753 2.0734027
## TREG+226-TIGIT+       0.829909315  0.867239697 -0.497418970 2.1099737
## TREG+39+              0.847837242  0.865698782 -1.164731512 1.9398886
## TREG+DR+              1.658350976  1.583106645  0.032533031 3.8202896
## TREG+PD-1+            0.821897718  0.672039469 -1.858199092 2.9088199
## TREG+TIGIT+           2.957800390  3.015045454  0.184339334 5.3598859
##                        normHits  decision
## CD4_TFH              0.01010101  Rejected
## CD4_TH1              0.00000000  Rejected
## CD4_TH17             0.00000000  Rejected
## CD4_TH17TO1          0.00000000  Rejected
## CD4_TH2              0.00000000  Rejected
## CD4_TH22             0.47474747 Tentative
## CD4+_(IM STAT)       0.88888889 Confirmed
## CD4+_                0.78787879 Confirmed
## CD4+_226+            0.53535354 Tentative
## CD4+_39+             0.00000000  Rejected
## CD4+_DR+             0.00000000  Rejected
## CD4+_PD-1+           0.01010101  Rejected
## CD4+_PD-1+TIGIT-     0.00000000  Rejected
## CD4+_PD-1+TIGIT+     0.00000000  Rejected
## CD4+_PD-1-TIGIT-     0.00000000  Rejected
## CD4+_PD-1-TIGIT+     0.00000000  Rejected
## CD4+_TIGIT+          0.00000000  Rejected
## CD4_NV(STAR2)        0.01010101  Rejected
## CD4_NV               0.00000000  Rejected
## CD4_NV_TH1           0.06060606  Rejected
## CD4_NV_TH17          0.00000000  Rejected
## CD4_NV_TH17TO1       0.00000000  Rejected
## CD4_NV_TH2           0.00000000  Rejected
## CD4_NV_TH22          0.00000000  Rejected
## CD4_NV+226+          0.00000000  Rejected
## CD4_NV+39+           0.00000000  Rejected
## CD4_NV+DR+           0.00000000  Rejected
## CD4_NV+PD-1+         0.03030303  Rejected
## CD4_NV+PD-1+TIGIT-   0.00000000  Rejected
## CD4_NV+PD-1+TIGIT+   0.83838384 Confirmed
## CD4_NV+PD-1-TIGIT-   0.00000000  Rejected
## CD4_NV+PD-1-TIGIT+   0.00000000  Rejected
## CD4_NV+TIGIT+        0.50505051 Tentative
## CD4_ЕМ               0.00000000  Rejected
## CD4_ЕМ_TH1           0.77777778 Confirmed
## CD4_ЕМ_TH17          0.00000000  Rejected
## CD4_ЕМ_TH17TO1       0.98989899 Confirmed
## CD4_ЕМ_TH2           0.00000000  Rejected
## CD4_ЕМ_TH22          0.00000000  Rejected
## CD4_ЕМ+226+          0.01010101  Rejected
## CD4_ЕМ+39+           0.00000000  Rejected
## CD4_ЕМ+DR+           0.00000000  Rejected
## CD4_ЕМ+PD-1+         0.00000000  Rejected
## CD4_ЕМ+PD-1+TIGIT-   0.00000000  Rejected
## CD4_ЕМ+PD-1+TIGIT+   0.00000000  Rejected
## CD4_ЕМ+PD-1-TIGIT-   0.00000000  Rejected
## CD4_ЕМ+PD-1-TIGIT+   0.00000000  Rejected
## CD4_ЕМ+TIGIT+        0.00000000  Rejected
## CD4_ЕМTM             0.00000000  Rejected
## CD4_СМ(STAR2)        0.00000000  Rejected
## CD4_СМ               0.00000000  Rejected
## CD4_СМ_TH1           0.49494949 Tentative
## CD4_СМ_TH17          0.36363636 Tentative
## CD4_СМ_TH17TO1       0.00000000  Rejected
## CD4_СМ_TH2           0.73737374 Confirmed
## CD4_СМ_TH22          0.95959596 Confirmed
## CD4_СМ+226+          0.00000000  Rejected
## CD4_СМ+39+           0.00000000  Rejected
## CD4_СМ+DR+           0.00000000  Rejected
## CD4_СМ+PD-1+         0.00000000  Rejected
## CD4_СМ+PD-1+TIGIT-   0.00000000  Rejected
## CD4_СМ+PD-1+TIGIT+   0.00000000  Rejected
## CD4_СМ+PD-1-TIGIT-   0.00000000  Rejected
## CD4_СМ+PD-1-TIGIT+   0.00000000  Rejected
## CD4_СМ+TIGIT+        0.01010101  Rejected
## CD4_ТM_TH1           0.00000000  Rejected
## CD4_ТM_TH17          0.00000000  Rejected
## CD4_ТM_TH17TO1       0.00000000  Rejected
## CD4_ТM_TH2           0.02020202  Rejected
## CD4_ТM_TH22          0.02020202  Rejected
## CD4_ТREG(STAR2)      0.89898990 Confirmed
## CD4_ТREG             0.02020202  Rejected
## CD4_ТREG_TH1         0.00000000  Rejected
## CD4_ТREG_TH17        0.49494949 Tentative
## CD4_ТREG_TH17TO1     0.00000000  Rejected
## CD4_ТREG_TH2         0.02020202  Rejected
## CD4_ТREG_TH22        0.00000000  Rejected
## CD4_ТЕ(STAR2)        0.00000000  Rejected
## CD4_ТЕ               0.00000000  Rejected
## CD4_ТЕ_TH1           0.02020202  Rejected
## CD4_ТЕ_TH17          0.00000000  Rejected
## CD4_ТЕ_TH17TO1       0.00000000  Rejected
## CD4_ТЕ_TH2           0.00000000  Rejected
## CD4_ТЕ_TH22          0.00000000  Rejected
## CD4_ТЕ+226+          0.00000000  Rejected
## CD4_ТЕ+39+           0.00000000  Rejected
## CD4_ТЕ+DR+           0.00000000  Rejected
## CD4_ТЕ+PD-1+         0.00000000  Rejected
## CD4_ТЕ+PD-1+TIGIT-   0.00000000  Rejected
## CD4_ТЕ+PD-1+TIGIT+   0.00000000  Rejected
## CD4_ТЕ+PD-1-TIGIT-   0.00000000  Rejected
## CD4_ТЕ+PD-1-TIGIT+   0.00000000  Rejected
## CD4_ТЕ+TIGIT+        0.00000000  Rejected
## CD4_ТМ               0.50505051 Tentative
## CD8+_(IM STAT)       0.01010101  Rejected
## CD8+_                0.66666667 Tentative
## CD8+_226+            0.31313131 Tentative
## CD8+_39+             0.00000000  Rejected
## CD8+_DR+             0.06060606  Rejected
## CD8+_PD-1+           0.01010101  Rejected
## CD8+_PD-1+TIGIT-     0.00000000  Rejected
## CD8+_PD-1+TIGIT+     0.00000000  Rejected
## CD8+_PD-1-TIGIT-     0.00000000  Rejected
## CD8+_PD-1-TIGIT+     0.00000000  Rejected
## CD8+_TIGIT+          0.00000000  Rejected
## CD8_EMTM             0.57575758 Tentative
## CD8_EMTM+226+        0.49494949 Tentative
## CD8_EMTM+39+         0.00000000  Rejected
## CD8_EMTM+DR+         0.73737374 Confirmed
## CD8_EMTM+PD-1+       0.66666667 Tentative
## CD8_EMTM+PD-1+TIGIT- 0.00000000  Rejected
## CD8_EMTM+PD-1+TIGIT+ 0.62626263 Tentative
## CD8_EMTM+PD-1-TIGIT- 0.14141414  Rejected
## CD8_EMTM+PD-1-TIGIT+ 0.49494949 Tentative
## CD8_EMTM+TIGIT+      0.76767677 Confirmed
## CD8_NV(STAR2)        0.01010101  Rejected
## CD8_NV               0.00000000  Rejected
## CD8_NV+226+          0.01010101  Rejected
## CD8_NV+39+           0.01010101  Rejected
## CD8_NV+DR+           0.00000000  Rejected
## CD8_NV+PD-1+         0.00000000  Rejected
## CD8_NV+PD-1+TIGIT-   0.00000000  Rejected
## CD8_NV+PD-1+TIGIT+   0.00000000  Rejected
## CD8_NV+PD-1-TIGIT-   0.00000000  Rejected
## CD8_NV+PD-1-TIGIT+   0.00000000  Rejected
## CD8_NV+TIGIT+        0.00000000  Rejected
## CD8_TREG             0.79797980 Confirmed
## CD8_ЕМ               0.00000000  Rejected
## CD8_СМ(STAR2)        0.23232323  Rejected
## CD8_СМ               0.00000000  Rejected
## CD8_СМ+226+          0.00000000  Rejected
## CD8_СМ+39+           0.00000000  Rejected
## CD8_СМ+DR+           0.01010101  Rejected
## CD8_СМ+PD-1+         0.00000000  Rejected
## CD8_СМ+PD-1+TIGIT-   0.00000000  Rejected
## CD8_СМ+PD-1+TIGIT+   0.00000000  Rejected
## CD8_СМ+PD-1-TIGIT-   0.00000000  Rejected
## CD8_СМ+PD-1-TIGIT+   0.00000000  Rejected
## CD8_СМ+TIGIT+        0.01010101  Rejected
## CD8_ТЕ(STAR2)        0.00000000  Rejected
## CD8_ТЕ               0.43434343 Tentative
## CD8_ТЕ+226+          0.00000000  Rejected
## CD8_ТЕ+39+           0.00000000  Rejected
## CD8_ТЕ+DR+           0.00000000  Rejected
## CD8_ТЕ+PD-1+         0.00000000  Rejected
## CD8_ТЕ+PD-1+TIGIT-   0.00000000  Rejected
## CD8_ТЕ+PD-1+TIGIT+   0.00000000  Rejected
## CD8_ТЕ+PD-1-TIGIT-   0.00000000  Rejected
## CD8_ТЕ+PD-1-TIGIT+   0.00000000  Rejected
## CD8_ТЕ+TIGIT+        0.01010101  Rejected
## CD8_ТМ               0.00000000  Rejected
## TREG_CM              0.00000000  Rejected
## TREG_EMTM            0.35353535 Tentative
## TREG_NV              0.00000000  Rejected
## TREG_TE              0.71717172 Confirmed
## TREG+226+            0.00000000  Rejected
## TREG+226+TIGIT-      0.00000000  Rejected
## TREG+226+TIGIT+      0.01010101  Rejected
## TREG+226-TIGIT-      0.00000000  Rejected
## TREG+226-TIGIT+      0.00000000  Rejected
## TREG+39+             0.00000000  Rejected
## TREG+DR+             0.02020202  Rejected
## TREG+PD-1+           0.00000000  Rejected
## TREG+TIGIT+          0.44444444 Tentative
# Extract important variables
important_vars <- getSelectedAttributes(boruta_result, withTentative = FALSE)

# Extract tentative variables
tentative_vars <- getSelectedAttributes(boruta_result, withTentative = TRUE)
only_tentative <- setdiff(tentative_vars, important_vars)

cat("Confirmed important variables:\n", paste(important_vars, collapse = ", "), "\n\n")
## Confirmed important variables:
##  CD4+_(IM STAT), CD4+_, CD4_NV+PD-1+TIGIT+, CD4_ЕМ_TH1, CD4_ЕМ_TH17TO1, CD4_СМ_TH2, CD4_СМ_TH22, CD4_ТREG(STAR2), CD8_EMTM+DR+, CD8_EMTM+TIGIT+, CD8_TREG, TREG_TE
cat("Tentative variables:\n", paste(only_tentative, collapse = ", "), "\n")
## Tentative variables:
##  CD4_TH22, CD4+_226+, CD4_NV+TIGIT+, CD4_СМ_TH1, CD4_СМ_TH17, CD4_ТREG_TH17, CD4_ТМ, CD8+_, CD8+_226+, CD8_EMTM, CD8_EMTM+226+, CD8_EMTM+PD-1+, CD8_EMTM+PD-1+TIGIT+, CD8_EMTM+PD-1-TIGIT+, CD8_ТЕ, TREG_EMTM, TREG+TIGIT+

3.3.4.1. Selected features analysis

selected_columns <- c(
  "CD4+_(IM STAT)", "CD4+_", "CD4_NV+PD-1+TIGIT+", "CD4_ЕМ_TH1", 
  "CD4_ЕМ_TH17TO1", "CD4_СМ_TH2", "CD4_СМ_TH22", "CD4_ТREG(STAR2)", 
  "CD8+_", "CD8_EMTM+DR+", "CD8_EMTM+TIGIT+", "CD8_TREG", "TREG_TE", 
  "Subject_ID", "Observation_Days_Count", "cGVHD_Diagnosis_Day", 
  "cGVHD_flag", "Blood_test_day"
)

df_cGVHD_prediction_selected <- df_cGVHD_prediction %>%
  select(all_of(selected_columns))

# Add additional columns 
additional_columns <- c(
  "CD4_NV+TIGIT+", "CD4_СМ_TH1", "CD4_СМ_TH17", "CD4_ТREG_TH17", 
  "CD4_ТМ", "CD8+_DR+", "CD8_EMTM", "CD8_EMTM+226+", 
  "CD8_EMTM+PD-1+", "CD8_EMTM+PD-1+TIGIT+", "CD8_EMTM+PD-1-TIGIT+", 
  "CD8_ТЕ", "TREG+226+TIGIT+", "TREG+DR+", "TREG+TIGIT+"
)

df_cGVHD_prediction_selected_plus <- df_cGVHD_prediction %>%
  select(all_of(c(selected_columns, additional_columns)))

Clusterisation features

df_normalized <- df_cGVHD_prediction_selected_plus %>%
  filter(Blood_test_day %in% c("90", "180", "365", "cGVHD")) %>%
  # Ensure cGVHD_flag is retained for later use
  select(cGVHD_flag, where(is.numeric), -Observation_Days_Count, -cGVHD_Diagnosis_Day) %>%
  mutate(across(where(is.numeric), ~ (.-mean(.)) / sd(.), .names = "z_{col}"))

# Save cGVHD_flag for later use
cGVHD_flags <- df_normalized$cGVHD_flag

# Perform near-zero variance filtering (excluding cGVHD_flag)
nzv <- nearZeroVar(df_normalized %>% select(-cGVHD_flag))
df_filtered <- df_normalized %>%
  select(-nzv, cGVHD_flag) %>% # Retain cGVHD_flag after filtering
  filter(complete.cases(.))

# Perform PCA (exclude cGVHD_flag from PCA computation)
pca_results <- prcomp(df_filtered %>% select(-cGVHD_flag), scale. = TRUE)

# Attach PCA scores and cGVHD_flag back together
pca_scores <- as.data.frame(pca_results$x) %>%
  mutate(cGVHD_flag = cGVHD_flags)

# Explained variance calculations
explained_variance <- pca_results$sdev^2 / sum(pca_results$sdev^2)
cumulative_variance <- cumsum(explained_variance)

# Create a data frame for plotting explained variance
explained_variance_data <- data.frame(
  PC = 1:length(explained_variance),
  Explained_Variance = explained_variance,
  Cumulative_Variance = cumulative_variance
)

# Filter for components contributing to the first 80% of cumulative variance
explained_variance_data_filtered <- explained_variance_data %>%
  filter(Cumulative_Variance <= 0.9)

# Add percentage labels for explained variance
explained_variance_data_filtered <- explained_variance_data_filtered %>%
  mutate(Variance_Percentage_Label = paste0(round(Explained_Variance * 100, 2), "%"))

# Plot explained variance with percentage labels
ggplot(explained_variance_data_filtered, aes(x = PC)) +
  geom_bar(aes(y = Explained_Variance), stat = "identity", fill = "steelblue") +
  geom_text(aes(y = Explained_Variance, label = Variance_Percentage_Label), 
            vjust = -0.5, size = 3.5) +
  geom_line(aes(y = Cumulative_Variance), color = "red", size = 1) +
  geom_point(aes(y = Cumulative_Variance), color = "red", size = 2) +
  scale_y_continuous(
    name = "Variance Explained",
    sec.axis = sec_axis(~., name = "Cumulative Variance Explained")
  ) +
  labs(
    title = "Explained Variance by Principal Components (First 80%)",
    x = "Principal Component"
  ) +
  theme_minimal(base_size = 14)

# Perform clustering on PCA scores
set.seed(123)
wss <- sapply(1:10, function(k) {
  kmeans(pca_scores[, 1:10], centers = k, nstart = 25)$tot.withinss
})

# Plot Elbow Method
elbow_plot <- data.frame(Clusters = 1:10, WSS = wss)
ggplot(elbow_plot, aes(x = Clusters, y = WSS)) +
  geom_line() +
  geom_point(size = 3) +
  labs(
    title = "Elbow Method for Optimal Clusters",
    x = "Number of Clusters",
    y = "Total Within-Cluster Sum of Squares (WSS)"
  ) +
  theme_minimal(base_size = 14)

# Apply K-means clustering
optimal_clusters <- 4
kmeans_result <- kmeans(pca_scores[, 1:10], centers = optimal_clusters, nstart = 25)
pca_scores$Cluster <- as.factor(kmeans_result$cluster)

# Visualize clusters with coloring by cGVHD_flag
ggplot(pca_scores, aes(x = PC1, y = PC2, color = as.factor(cGVHD_flag))) +
  geom_point(size = 2, alpha = 0.8) +  # Scatterplot of PCA points
  stat_ellipse(aes(group = Cluster), type = "t", linetype = "dashed", size = 1, color = "black") +  # Ellipses for clusters
  scale_color_brewer(palette = "Set1") +  # Color palette for cGVHD_flag
  labs(
    title = "PCA Clusters with cGVHD_flag Coloring and Cluster Ellipses",
    x = "PC1 (Principal Component 1)",
    y = "PC2 (Principal Component 2)",
    color = "cGVHD_flag"
  ) +
  theme_minimal(base_size = 14) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", hjust = 0.5)
  )

# Group by Cluster and cGVHD_flag to count occurrences
cluster_cGVHD_counts <- pca_scores %>%
  group_by(Cluster, cGVHD_flag) %>%
  summarise(Count = n(), .groups = "drop") %>%
  arrange(Cluster, cGVHD_flag)

# View the summarized table
print(cluster_cGVHD_counts)
## # A tibble: 8 × 3
##   Cluster cGVHD_flag Count
##   <fct>   <fct>      <int>
## 1 1       0              3
## 2 1       1             25
## 3 2       0              3
## 4 2       1              1
## 5 3       0             19
## 6 3       1             11
## 7 4       0             55
## 8 4       1             55

3.3.5. Random Forest over Survival (90 days)

3.3.5.1. All Visits

df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
  mutate(
    cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
    
    Blood_test_day = ifelse(Blood_test_day == "cGVHD", cGVHD_Diagnosis_Day, Blood_test_day),
    Blood_test_day = as.numeric(as.character(Blood_test_day)),
    cGVHD_flag = as.numeric(as.character(cGVHD_flag))
  ) %>%
  select(-Observation_Days_Count) %>%
  select(-Subject_ID)

# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
  filter(complete.cases(.))  # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 50) {
  results <- data.frame(Trees = integer(), OOB_Error = numeric())
  
  for (ntree in seq(50, max_trees, by = step)) {
    set.seed(123)  # Ensure reproducibility
    rf_model <- rfsrc(
      formula = formula,
      data = data,
      mtry = round(sqrt(ncol(data) - 2)),
      nodesize = 15,
      ntree = ntree,  # Number of trees
      importance = TRUE
    )
    
    # Extract the OOB Requested performance error
    oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
    results <- rbind(results, data.frame(
      Trees = ntree,
      OOB_Error = oob_error
    ))
  }
  
  return(results)
}

# Apply the function to your dataset
oob_results <- evaluate_trees(
  data = dataSet,  # Replace with your dataset
  formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
  max_trees = 1000,
  step = 50
)

ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
  geom_line() +
  geom_point(size = 2) +
  labs(
    title = "OOB Requested Performance Error vs. Number of Trees",
    x = "Number of Trees",
    y = "OOB Requested Performance Error"
  ) +
  theme_minimal()

# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]

print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 500"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
                dataSet,
                ntree = best_trees,
                mtry = round(sqrt(ncol(dataSet) - 2)),
                nodesize = 15,
                membership = TRUE,
                importance=TRUE)

# Print the Random Survival Forest object
print(RF_obj)
##                          Sample size: 172
##                     Number of deaths: 92
##                      Number of trees: 500
##            Forest terminal node size: 15
##        Average no. of terminal nodes: 9.864
## No. of variables tried at each split: 13
##               Total no. of variables: 165
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 109
##                             Analysis: RSF
##                               Family: surv
##                       Splitting rule: logrank *random*
##        Number of random split points: 10
##                           (OOB) CRPS: 74.62375368
##                    (OOB) stand. CRPS: 0.1486529
##    (OOB) Requested performance error: 0.33273494
# Create a hypothetical observation for prediction
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names

newdata [,which(RF_obj$xvar.names == "peak_vo2")] <- quantile(RF_obj$xvar$peak_vo2, 0.25)

# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])
par(cex.axis = 1.0, cex.lab = 1.0, cex.main = 1.0, mar = c(6.0,6,1,1), mgp = c(4, 1, 0))
# Plot predicted survival probability over time
plot(
  round(y.pred$time.interest, 2), 
  y.pred$survival[1, ], 
  type = "l", 
  xlab = "Time (Days)", 
  ylab = "Survival Probability", 
  col = 1, 
  lty = 1, 
  lwd = 2, 
  main = "Predicted Survival Curve"
)

  • The survival probability starts close to 1 (indicating almost all patients are initially disease-free).
  • It declines gradually, indicating that the risk of cGVHD increases over time.
  • Significant drops around certain time points suggest periods where cGVHD occurrences are more frequent.
  • By ~500 days, the survival probability approaches a lower value, reflecting the cumulative risk over time.
# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score

# Plot the Brier score
plot(
  bs.km, 
  type = "s", 
  col = 2, 
  ylab = "Brier Score", 
  xlab = "Time (Days)", 
  main = "Brier Score Over Time"
)

  • Initially, the Brier score is low, reflecting good prediction accuracy in the early stages.

  • The score increases over time, indicating reduced prediction accuracy as time progresses.

  • Beyond ~300 days, the Brier score stabilizes.

  • < 0.1 - excellent

  • <= 0.2 - superior

  • <=0.3 - adequate

# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance

# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
  Variable = names(vimp),
  VIMP = vimp
)

# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
  filter(VIMP > 0) %>%
  arrange(desc(VIMP)) %>%
  head(30) 

# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Variable Importance (VIMP) in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 8)

# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
  filter(VIMP < 0) %>%
  arrange(VIMP)  # Sort in ascending order

# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(
    title = "Variables with Negative VIMP in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 8)

# Compare with VIMP
vimp_boruta_comparison <- vimp_df %>%
  mutate(
    Boruta_Status = case_when(
      Variable %in% important_vars ~ "Confirmed Important",
      Variable %in% tentative_vars ~ "Tentative",
      TRUE ~ "Rejected"
    )
  )

# Filter for top N variables based on absolute VIMP
top_n <- 30  # Adjust this number as needed
vimp_boruta_comparison_filtered <- vimp_boruta_comparison %>%
  arrange(desc(VIMP)) %>%
  head(top_n)

# Plot Comparison for the filtered variables
ggplot(vimp_boruta_comparison_filtered, aes(x = reorder(Variable, VIMP), y = VIMP, fill = Boruta_Status)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(
    title = paste("Top", top_n, "Variables: VIMP and Boruta Comparison"),
    x = "Variables",
    y = "VIMP",
    fill = "Boruta Status"
  ) +
  theme_minimal(base_size = 10) +
  theme(
    legend.position = "bottom",
    plot.title = element_text(face = "bold", hjust = 0.5),
    axis.text.y = element_text(size = 8)
  )

3.3.5.2. 90 days

df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
  mutate(
    cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
    cGVHD_flag = as.numeric(as.character(cGVHD_flag))
  ) %>%
  select(-Observation_Days_Count) %>%
  filter(Blood_test_day %in% c("90")) %>%
  select(-Subject_ID, -Blood_test_day)

# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
  filter(complete.cases(.))  # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 50) {
  results <- data.frame(Trees = integer(), OOB_Error = numeric())
  
  for (ntree in seq(50, max_trees, by = step)) {
    set.seed(123)  # Ensure reproducibility
    rf_model <- rfsrc(
      formula = formula,
      data = data,
      mtry = round(sqrt(ncol(data) - 2)),
      nodesize = 15,
      ntree = ntree,  # Number of trees
      importance = TRUE
    )
    
    # Extract the OOB Requested performance error
    oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
    results <- rbind(results, data.frame(
      Trees = ntree,
      OOB_Error = oob_error
    ))
  }
  
  return(results)
}

# Apply the function to your dataset
oob_results <- evaluate_trees(
  data = dataSet,  # Replace with your dataset
  formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
  max_trees = 1000,
  step = 50
)

ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
  geom_line() +
  geom_point(size = 2) +
  labs(
    title = "OOB Requested Performance Error vs. Number of Trees",
    x = "Number of Trees",
    y = "OOB Requested Performance Error"
  ) +
  theme_minimal()

# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]

print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 100"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
                dataSet,
                ntree = best_trees,
                mtry = round(sqrt(ncol(dataSet) - 2)),
                nodesize = 15,
                membership = TRUE,
                importance=TRUE)

# Print the Random Survival Forest object
print(RF_obj)
##                          Sample size: 67
##                     Number of deaths: 28
##                      Number of trees: 100
##            Forest terminal node size: 15
##        Average no. of terminal nodes: 4.22
## No. of variables tried at each split: 13
##               Total no. of variables: 164
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 42
##                             Analysis: RSF
##                               Family: surv
##                       Splitting rule: logrank *random*
##        Number of random split points: 10
##                           (OOB) CRPS: 81.34704027
##                    (OOB) stand. CRPS: 0.1620459
##    (OOB) Requested performance error: 0.45494186

A lower CRPS indicates better performance.

# Create a hypothetical observation for prediction
# Creating an hypothetical observation 
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names

# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])

# Plot predicted survival probability over time
par(cex.axis = 1.0, cex.lab = 1.0, cex.main = 1.0, mar = c(6.0, 6, 1, 1), mgp = c(4, 1, 0))
plot(
  round(y.pred$time.interest, 2), 
  y.pred$survival[1, ], 
  type = "l", 
  xlab = "Time (Days)", 
  ylab = "Survival Probability", 
  col = 1, 
  lty = 1, 
  lwd = 2, 
  main = "Predicted Survival Curve"
)

# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score

# Plot the Brier score
plot(
  bs.km, 
  type = "s", 
  col = 2, 
  ylab = "Brier Score", 
  xlab = "Time (Days)", 
  main = "Brier Score Over Time"
)

  • < 0.1 - excellent
  • <= 0.2 - superior
  • <=0.3 - adequate
# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance

# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
  Variable = names(vimp),
  VIMP = vimp
)

# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
  filter(VIMP > 0) %>%
  arrange(desc(VIMP))

# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Variable Importance (VIMP) in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 5)

# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
  filter(VIMP < 0) %>%
  arrange(VIMP)  # Sort in ascending order

# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(
    title = "Variables with Negative VIMP in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 6)

3.3.5.3 180 days

df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
  mutate(
    cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
    cGVHD_flag = as.numeric(as.character(cGVHD_flag))
  ) %>%
  select(-Observation_Days_Count) %>%
  filter(Blood_test_day %in% c("180")) %>%
  select(-Subject_ID, -Blood_test_day)

# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
  filter(complete.cases(.))  # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 50) {
  results <- data.frame(Trees = integer(), OOB_Error = numeric())
  
  for (ntree in seq(50, max_trees, by = step)) {
    set.seed(123)  # Ensure reproducibility
    rf_model <- rfsrc(
      formula = formula,
      data = data,
      mtry = round(sqrt(ncol(data) - 2)),
      nodesize = 15,
      ntree = ntree,  # Number of trees
      importance = TRUE
    )
    
    # Extract the OOB Requested performance error
    oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
    results <- rbind(results, data.frame(
      Trees = ntree,
      OOB_Error = oob_error
    ))
  }
  
  return(results)
}

# Apply the function to your dataset
oob_results <- evaluate_trees(
  data = dataSet,  # Replace with your dataset
  formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
  max_trees = 1000,
  step = 50
)

ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
  geom_line() +
  geom_point(size = 2) +
  labs(
    title = "OOB Requested Performance Error vs. Number of Trees",
    x = "Number of Trees",
    y = "OOB Requested Performance Error"
  ) +
  theme_minimal()

# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]

print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 350"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
                dataSet,
                ntree = best_trees,
                mtry = round(sqrt(ncol(dataSet) - 2)),
                nodesize = 15,
                membership = TRUE,
                importance=TRUE)

# Print the Random Survival Forest object
print(RF_obj)
##                          Sample size: 54
##                     Number of deaths: 26
##                      Number of trees: 350
##            Forest terminal node size: 15
##        Average no. of terminal nodes: 2.9657
## No. of variables tried at each split: 13
##               Total no. of variables: 164
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 34
##                             Analysis: RSF
##                               Family: surv
##                       Splitting rule: logrank *random*
##        Number of random split points: 10
##                           (OOB) CRPS: 92.42719353
##                    (OOB) stand. CRPS: 0.18411792
##    (OOB) Requested performance error: 0.621139

A lower CRPS indicates better performance.

# Create a hypothetical observation for prediction
# Creating an hypothetical observation 
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names

# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])

# Plot predicted survival probability over time

plot(
  round(y.pred$time.interest, 2), 
  y.pred$survival[1, ], 
  type = "l", 
  xlab = "Time (Days)", 
  ylab = "Survival Probability", 
  col = 1, 
  lty = 1, 
  lwd = 2, 
  main = "Predicted Survival Curve"
)

# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score

# Plot the Brier score
plot(
  bs.km, 
  type = "s", 
  col = 2, 
  ylab = "Brier Score", 
  xlab = "Time (Days)", 
  main = "Brier Score Over Time"
)

  • < 0.1 - excellent
  • <= 0.2 - superior
  • <=0.3 - adequate
# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance

# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
  Variable = names(vimp),
  VIMP = vimp
)

# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
  filter(VIMP > 0) %>%
  arrange(desc(VIMP))

# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Variable Importance (VIMP) in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 4)

# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
  filter(VIMP < 0) %>%
  arrange(VIMP)  # Sort in ascending order

# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(
    title = "Variables with Negative VIMP in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 4)

3.3.5.4 365 days

df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
  mutate(
    cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
    
    Blood_test_day = ifelse(Blood_test_day == "cGVHD", cGVHD_Diagnosis_Day, Blood_test_day),
    Blood_test_day = as.numeric(as.character(Blood_test_day)),
    cGVHD_flag = as.numeric(as.character(cGVHD_flag))
  ) %>%
  select(-Observation_Days_Count) %>%
  filter(Blood_test_day %in% c("365")) %>%
  select(-Subject_ID, -Blood_test_day)

# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
  filter(complete.cases(.))  # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 20) {
  results <- data.frame(Trees = integer(), OOB_Error = numeric())
  
  for (ntree in seq(20, max_trees, by = step)) {
    set.seed(123)  # Ensure reproducibility
    rf_model <- rfsrc(
      formula = formula,
      data = data,
      mtry = round(sqrt(ncol(data) - 2)),
      nodesize = 15,
      ntree = ntree,  # Number of trees
      importance = TRUE
    )
    
    # Extract the OOB Requested performance error
    oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
    results <- rbind(results, data.frame(
      Trees = ntree,
      OOB_Error = oob_error
    ))
  }
  
  return(results)
}

# Apply the function to your dataset
oob_results <- evaluate_trees(
  data = dataSet,  # Replace with your dataset
  formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
  max_trees = 1000,
  step = 50
)

ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
  geom_line() +
  geom_point(size = 2) +
  labs(
    title = "OOB Requested Performance Error vs. Number of Trees",
    x = "Number of Trees",
    y = "OOB Requested Performance Error"
  ) +
  theme_minimal()

# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]

print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 20"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
                dataSet,
                ntree = best_trees,
                mtry = round(sqrt(ncol(dataSet) - 2)),
                nodesize = 15,
                membership = TRUE,
                importance=TRUE)

# Print the Random Survival Forest object
print(RF_obj)
##                          Sample size: 26
##                     Number of deaths: 13
##                      Number of trees: 20
##            Forest terminal node size: 15
##        Average no. of terminal nodes: 1
## No. of variables tried at each split: 13
##               Total no. of variables: 164
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 16
##                             Analysis: RSF
##                               Family: surv
##                       Splitting rule: logrank *random*
##        Number of random split points: 10
##                           (OOB) CRPS: 89.22882723
##                    (OOB) stand. CRPS: 0.17774667
##    (OOB) Requested performance error: 0.84552846

A lower CRPS indicates better performance.

# Create a hypothetical observation for prediction
# Creating an hypothetical observation 
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names

# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])

# Plot predicted survival probability over time
par(cex.axis = 1.0, cex.lab = 1.0, cex.main = 1.0, mar = c(6.0, 6, 1, 1), mgp = c(4, 1, 0))
plot(
  round(y.pred$time.interest, 2), 
  y.pred$survival[1, ], 
  type = "l", 
  xlab = "Time (Days)", 
  ylab = "Survival Probability", 
  col = 1, 
  lty = 1, 
  lwd = 2, 
  main = "Predicted Survival Curve"
)

# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score

# Plot the Brier score
plot(
  bs.km, 
  type = "s", 
  col = 2, 
  ylab = "Brier Score", 
  xlab = "Time (Days)", 
  main = "Brier Score Over Time"
)

  • < 0.1 - excellent
  • <= 0.2 - superior
  • <=0.3 - adequate
# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance

# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
  Variable = names(vimp),
  VIMP = vimp
)

# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
  filter(VIMP > 0) %>%
  arrange(desc(VIMP))

# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Variable Importance (VIMP) in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 5)

# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
  filter(VIMP < 0) %>%
  arrange(VIMP)  # Sort in ascending order

# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(
    title = "Variables with Negative VIMP in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 6)

3.3.5.5 cGVHD

df_cGVHD_prediction_survival <- df_cGVHD_prediction %>%
  mutate(
    cGVHD_Diagnosis_Day = ifelse(is.na(cGVHD_Diagnosis_Day), Observation_Days_Count, cGVHD_Diagnosis_Day),
    
    cGVHD_flag = as.numeric(as.character(cGVHD_flag))
  ) %>%
  select(-Observation_Days_Count) %>%
  filter(Blood_test_day %in% c("cGVHD")) %>%
  select(-Subject_ID, -Blood_test_day)

# Define the dataset
dataSet <- df_cGVHD_prediction_survival %>%
  filter(complete.cases(.))  # Ensure no missing data
# Function to evaluate OOB Requested performance error for different numbers of trees
evaluate_trees <- function(data, formula, max_trees = 1000, step = 20) {
  results <- data.frame(Trees = integer(), OOB_Error = numeric())
  
  for (ntree in seq(10, max_trees, by = step)) {
    set.seed(123)  # Ensure reproducibility
    rf_model <- rfsrc(
      formula = formula,
      data = data,
      mtry = round(sqrt(ncol(data) - 2)),
      nodesize = 15,
      ntree = ntree,  # Number of trees
      importance = TRUE
    )
    
    # Extract the OOB Requested performance error
    oob_error <- rf_model$err.rate[length(rf_model$err.rate)]
    results <- rbind(results, data.frame(
      Trees = ntree,
      OOB_Error = oob_error
    ))
  }
  
  return(results)
}

# Apply the function to your dataset
oob_results <- evaluate_trees(
  data = dataSet,  # Replace with your dataset
  formula = Surv(cGVHD_Diagnosis_Day, cGVHD_flag) ~ .,
  max_trees = 1000,
  step = 50
)

ggplot(oob_results, aes(x = Trees, y = OOB_Error)) +
  geom_line() +
  geom_point(size = 2) +
  labs(
    title = "OOB Requested Performance Error vs. Number of Trees",
    x = "Number of Trees",
    y = "OOB Requested Performance Error"
  ) +
  theme_minimal()

# Find the number of trees with the minimum OOB error
best_trees <- oob_results$Trees[which.min(oob_results$OOB_Error)]

print(paste("Best number of trees:", best_trees))
## [1] "Best number of trees: 10"
# Build the Random Survival Forest model
RF_obj <- rfsrc(Surv(cGVHD_Diagnosis_Day,cGVHD_flag)~.,
                dataSet,
                ntree = best_trees,
                mtry = round(sqrt(ncol(dataSet) - 2)),
                nodesize = 15,
                membership = TRUE,
                importance=TRUE)

# Print the Random Survival Forest object
print(RF_obj)
##                          Sample size: 25
##                     Number of deaths: 25
##                      Number of trees: 10
##            Forest terminal node size: 15
##        Average no. of terminal nodes: 1
## No. of variables tried at each split: 13
##               Total no. of variables: 164
##        Resampling used to grow trees: swor
##     Resample size used to grow trees: 16
##                             Analysis: RSF
##                               Family: surv
##                       Splitting rule: logrank *random*
##        Number of random split points: 10
##                           (OOB) CRPS: 39.19364759
##                    (OOB) stand. CRPS: 0.078075
##    (OOB) Requested performance error: 0.64166667

A lower CRPS indicates better performance.

# Create a hypothetical observation for prediction
# Creating an hypothetical observation 
newdata <- data.frame(lapply(1:ncol(RF_obj$xvar),function(i){median(RF_obj$xvar[,i])}))
colnames(newdata) <- RF_obj$xvar.names

# Predict survival for the hypothetical observation
y.pred <- predict(RF_obj, newdata = rbind(newdata, RF_obj$xvar)[1, ])

# Plot predicted survival probability over time
par(cex.axis = 1.0, cex.lab = 1.0, cex.main = 1.0, mar = c(6.0, 6, 1, 1), mgp = c(4, 1, 0))
plot(
  round(y.pred$time.interest, 2), 
  y.pred$survival[1, ], 
  type = "l", 
  xlab = "Time (Days)", 
  ylab = "Survival Probability", 
  col = 1, 
  lty = 1, 
  lwd = 2, 
  main = "Predicted Survival Curve"
)

# Calculate the Brier score using the Kaplan-Meier censoring distribution
bs.km <- get.brier.survival(RF_obj, cens.mode = "km")$brier.score

# Plot the Brier score
plot(
  bs.km, 
  type = "s", 
  col = 2, 
  ylab = "Brier Score", 
  xlab = "Time (Days)", 
  main = "Brier Score Over Time"
)

  • < 0.1 - excellent
  • <= 0.2 - superior
  • <=0.3 - adequate
# Extract Variable Importance (VIMP)
vimp <- RF_obj$importance

# Convert VIMP to a data frame for easy interpretation
vimp_df <- data.frame(
  Variable = names(vimp),
  VIMP = vimp
)

# Sort variables by VIMP in descending order
vimp_df_positive <- vimp_df %>%
  filter(VIMP > 0) %>%
  arrange(desc(VIMP))

# Plot Variable Importance
ggplot(vimp_df_positive, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Variable Importance (VIMP) in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 5)

# Filter for variables with negative VIMP
vimp_negative <- vimp_df %>%
  filter(VIMP < 0) %>%
  arrange(VIMP)  # Sort in ascending order

# Plot variables with negative VIMP
ggplot(vimp_negative, aes(x = reorder(Variable, VIMP), y = VIMP)) +
  geom_bar(stat = "identity", fill = "red") +
  coord_flip() +
  labs(
    title = "Variables with Negative VIMP in RSF",
    x = "Variables",
    y = "VIMP"
  ) +
  theme_minimal(base_size = 6)

4. Diagnostics of cGVHD

Comparison on the day of the onset of chronic GVHD with the closest points in terms of timing in patients without chronic GVHD.

Group 1: Patients diagnosed with cGVHD (cGVHD_flag = 1). Group 2: Patients without cGVHD (cGVHD_flag = 0).

Due to large ammount of immune cell markers (332):

  1. Cluster immune cell markers to identify groups of similar variables.
  2. After clustering, perform statistical comparisons for clasters.

4.1. Define Data

# Plot the density plot for cGVHD Diagnosis Day
df_transformed %>%
  filter(cGVHD_flag == 1) %>%
  ggplot(aes(x = cGVHD_Diagnosis_Day)) +
  geom_density(fill = "blue", alpha = 0.7) +
  theme_minimal() +
  labs(
    title = "Distribution of cGVHD Diagnosis Day",
    x = "cGVHD Diagnosis Day",
    y = "Density"
  )

5. Immunological Profiles of Patients with or without cGVHD